## Print the wall-clock time at which the script starts.
print(Sys.time())

## Keep strings as character vectors when data frames are built. Written as
## FALSE rather than F, because F is an ordinary variable that can be
## reassigned.
options(stringsAsFactors = FALSE)

suppressPackageStartupMessages(library(devtools))
suppressPackageStartupMessages(library(stm))
suppressPackageStartupMessages(library(readr))
suppressPackageStartupMessages(library(plyr))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(tm))
suppressPackageStartupMessages(library(Matrix))
suppressPackageStartupMessages(library(RSpectra))

## install_github("wilryh/parrot", dependencies=TRUE)
suppressPackageStartupMessages(library(parrot))

## calcMI() is called further down and is not defined in this file;
## presumably it comes from this sourced script -- TODO confirm.
source("code/calc_mi.R")

## Selects which pre-built data set is loaded and is embedded in every output
## file name written by this script.
tweets_or_descriptions <- "tweets"

#### #### Load and prep data
#### ####
## meta_election16 and tdm_election16 are used below without being created in
## this file; presumably they are restored by this load() -- TODO confirm.
election16_rdata <- paste0(
    "data/russia_",
    tweets_or_descriptions,
    "_meta_tdm_election16_replicate_psrm_low_thresh0.RData"
)
load(file = election16_rdata)

## ira_labels is presumably restored by this load() -- TODO confirm.
## Its first column is renamed so that it can serve as the join key.
load("data/labeled_troll_tweets_replicate_psrm.RData")
names(ira_labels)[1] <- "tweetid"

## join on tweet ID (user IDs hashed in Twitter data)
## Twitter did not respond to request for unhashed data
## The label column is renamed first so that it is kept apart from the
## account-level account_category column that is joined on later.
tweet_level_labels <- rename(
    ira_labels,
    account_category_by_tweet = account_category
)
meta_election16 <- left_join(
    meta_election16,
    tweet_level_labels,
    by = "tweetid"
)

## Row count and number of distinct accounts after the join (shown when the
## script is run with top-level auto-printing).
nrow(meta_election16)
length(unique(meta_election16$userid))

## Return the distinct user IDs that have at least one tweet whose tweet-level
## label (account_category_by_tweet) equals `.category`.
##
## Reads the global data frame `meta_election16`. Rows whose label is missing
## never match.
uniqueUserIDs <- function(.category) {
    tweet_labels <- meta_election16$account_category_by_tweet
    ## which() discards NA comparison results, exactly as subset() does
    matching_rows <- which(!is.na(tweet_labels) & tweet_labels == .category)
    unique(meta_election16$userid[matching_rows])
}

## Every user ID that has at least one labeled tweet, whatever the label.
has_tweet_label <- !is.na(meta_election16$account_category_by_tweet)
all_trolls <- unique(meta_election16$userid[has_tweet_label])

## Categories in the order in which they are stacked into
## troll_account_labels below.
troll_categories <- c(
    "RightTroll", "LeftTroll", "HashtagGamer", "NewsFeed",
    "Commercial", "NonEnglish", "Unknown", "Fearmonger"
)
users_by_category <- lapply(troll_categories, uniqueUserIDs)
names(users_by_category) <- troll_categories

## One vector of user IDs per category; several are reused later in the script.
right_trolls <- users_by_category[["RightTroll"]]
left_trolls <- users_by_category[["LeftTroll"]]
hashtag_trolls <- users_by_category[["HashtagGamer"]]
news_trolls <- users_by_category[["NewsFeed"]]
commercial_trolls <- users_by_category[["Commercial"]]
nonenglish_trolls <- users_by_category[["NonEnglish"]]
fearmonger_trolls <- users_by_category[["Fearmonger"]]
unknown_trolls <- users_by_category[["Unknown"]]
## Disabled: an "other_troll" bucket for labeled accounts outside the lists above
## other_trolls <- all_trolls[!(all_trolls %in% c(right_trolls, left_trolls, hashtag_trolls, news_trolls, commercial_trolls, nonenglish_trolls))]

## Account-level label table: one row per (userid, category) pair.
## NOTE(review): a userid that has tweets under more than one category would
## get several rows here, and the join below would then duplicate that
## account's tweets -- confirm that labels are unique per account.
troll_account_labels <- data.frame(
    userid = do.call(c, unname(users_by_category)),
    account_category = rep(
        troll_categories,
        times = lengths(users_by_category)
    )
)

## Attach the account-level category to every tweet of that account.
meta_election16 <- left_join(
    meta_election16,
    troll_account_labels,
    by = "userid"
)

## Flag tweets from accounts treated as non-English.
##
## A row is flagged when
##   (a) the profile description contains a character whose code point lies in
##       1000:1999 (a range that includes the Cyrillic block), or the account
##       language is not "en", AND
##   (b) the account either has no labeled tweets at all or is labeled
##       NonEnglish -- so accounts labeled with any other category are never
##       flagged.
##
## vapply() is used instead of sapply() so the result is always a logical
## vector, including when there are zero rows (sapply() would return list()
## and the `|` below would fail).
## NOTE(review): a missing account_language makes the comparison NA, which can
## propagate into not_english -- confirm the column has no NAs.
has_code_point_1000_1999 <- vapply(
    meta_election16$user_profile_description,
    function(x) any(utf8ToInt(x) %in% 1000:1999),
    logical(1),
    USE.NAMES = FALSE
)
meta_election16$not_english <- (
    has_code_point_1000_1999 |
    meta_election16$account_language != "en"
) & (
    !(meta_election16$userid %in% all_trolls) |
    meta_election16$userid %in% nonenglish_trolls
)

## Distinct user IDs with at least one flagged row.
all_nonenglish_trolls <- unique(subset(meta_election16, not_english)$userid)
## meta_election16$not_2016 <- !(substr(meta_election16$tweet_time, 1, 10)>="2016-01-01") | !(substr(meta_election16$tweet_time, 1, 10)<="2016-11-08")

#### Report filtering
####

## Append one `"label":value` line to the processing log.
append_filter_log <- function(
    label,
    count,
    log_file = "logs/process_russian_account_tweets_replicate_psrm.log"
) {
    write.table(
        data.frame(variable = label, value = count),
        file = log_file,
        append = TRUE,
        sep = ":", col.names = FALSE, row.names = FALSE
    )
}

cat("\nSum not English:", sum(meta_election16$not_english), "\n")

## Number of tweets kept (rows not flagged as non-English).
append_filter_log(
    "!(english meta_election16$not_english)",
    sum(!(meta_election16$not_english))
)

## Number of distinct accounts among the kept tweets.
## which() ignores NA flags, matching the row selection subset() performs.
kept_rows <- which(!meta_election16$not_english)
append_filter_log(
    "length(unique(subset(meta_election16, !not_english)$userid))",
    length(unique(meta_election16$userid[kept_rows]))
)

## Kept accounts that carry an account-level label.
kept_labeled_account_rows <- which(
    !meta_election16$not_english &
    meta_election16$userid %in% troll_account_labels$userid
)
length(unique(meta_election16$userid[kept_labeled_account_rows]))

## Kept tweets that carry a tweet-level label.
length(
    which(
        !meta_election16$not_english &
        meta_election16$tweetid %in% ira_labels$tweetid
    )
)


## Persist the labeled metadata (troll_account_labels is deliberately not
## saved; it was commented out of this call).
save(
    meta_election16,
    file = "data/labeled_troll_accounts_meta_replicate_psrm.RData"
)

## Two analysis windows, paired by position: the first runs from the earliest
## tweet date in the data, the second from 2016-01-01; both end on 2016-11-08.
## Dates are the first 10 characters of tweet_time, compared as strings.
earliest_tweet_date <- min(substr(meta_election16$tweet_time, 1, 10))
window_end <- "2016-11-08"
start_dates <- c(earliest_tweet_date, "2016-01-01")
end_dates <- rep(window_end, 2)


#### #### Text scaling and mutual information
#### ####

## scale_text(), score_documents() and get_keywords() are not defined in this
## file; presumably they come from the parrot package and/or this sourced
## script -- TODO confirm.
source("code/scale_text_simple.R")

## All of the paper results removed news spam, so this is fixed to TRUE (a
## loop over both settings used to wrap the cluster loop and was disabled).
remove_news <- TRUE

## Calendar date of every tweet (first 10 characters of tweet_time). It does
## not change inside the loops, so it is computed once. Dates are compared as
## strings throughout.
tweet_date <- substr(meta_election16$tweet_time, 1, 10)

## For every date window (just 2016 or not) and every cluster ("all" or one
## account category):
##   1. build the holdout mask,
##   2. scale the text and save word/document scores and keywords,
##   3. for the window that starts before 2016, compute mutual information.
for (i in seq_along(start_dates)) {#dates

    start_date <- start_dates[i]
    end_date <- end_dates[i]

    ## loop through each cluster or not
    for (the_cluster in c("all","LeftTroll","RightTroll","HashtagGamer","NewsFeed")) {

        print(the_cluster)
        print(start_date)
        print(end_date)

        ## Filtering
        ##
        ## Hold out non-English rows and rows outside [start_date, end_date].
        meta_election16$holdout <- meta_election16$not_english |
            !(tweet_date >= start_date) |
            !(tweet_date <= end_date)

        ## Unless the NewsFeed cluster itself is being analysed, hold out
        ## NewsFeed accounts; for the window starting before 2016 also hold
        ## out Commercial accounts.
        if (remove_news && the_cluster != "NewsFeed") {
            meta_election16$holdout <- meta_election16$holdout |
                meta_election16$userid %in% news_trolls
            if (start_date < "2016-01-01") {
                meta_election16$holdout <- meta_election16$holdout |
                    meta_election16$userid %in% commercial_trolls
            }
        }

        if (the_cluster != "all") {
            these_trolls <- unique(
                subset(
                    meta_election16,
                    !is.na(account_category) & account_category==the_cluster
                )$userid
            )
            ## Keep the mask from before the cluster restriction: the
            ## "these trolls vs. others" comparison below uses it.
            meta_election16$same_holdout_all_trolls <- meta_election16$holdout
            meta_election16$holdout <- meta_election16$holdout |
                !(meta_election16$userid %in% all_trolls) |
                !(meta_election16$userid %in% these_trolls)
        }
        ## meta_election16$holdout <- meta_election16$holdout | meta_election16$tweetid %in% ira_labels$tweetid #do not train on mturk

        meta_election16$not_holdout <- !meta_election16$holdout


        ## Scale text
        ##
        ## Per-cluster scaling is only run for the window that starts in 2016;
        ## the "all" scaling is run for both windows.
        if (the_cluster == "all" || start_date >= "2016-01-01")  {

            ## NOTE: the timing returned by system.time() is not printed,
            ## because values are not auto-printed inside a loop.
            system.time(
                scores_election16 <- scale_text(
                    meta = meta_election16,
                    tdm = tdm_election16,
                    compress_fast = TRUE,
                    constrain_outliers = FALSE,
                    simple = TRUE,
                    holdout = meta_election16$holdout,
                    approx = TRUE
                )
            )

            ## 100 dimensions for the pooled model, 10 for a single cluster.
            document_scores_election16 <- score_documents(
                scores = scores_election16,
                n_dimensions = if (the_cluster == "all") 100 else 10,
                doc_length_correction = 1
            )

            ## Slim copy of the scaling result: drop the bulky inputs and keep
            ## only the first 10 columns of the word and pivot scores.
            just_scores <- scores_election16
            just_scores$meta <- NULL
            just_scores$tdm <- NULL
            just_scores$tdm_orig <- NULL
            just_scores$word_scores <- just_scores$word_scores[,1:10]
            just_scores$pivot_scores <- just_scores$pivot_scores[,1:10]
            ## Euclidean norm of each row of pivot_scores, first column excluded.
            just_scores$sum_pivot_scores <- sqrt(rowSums(just_scores$pivot_scores[,-1]^2))

            save(
                just_scores,
                file=paste0(
                    "data/en_",tweets_or_descriptions,"_",
                    start_date,"_to_", end_date, "_dimensions_election16",
                    if (remove_news) "_remove_news" else "", "_",
                    the_cluster,"_scores_only_replicate_psrm.RData"
                )
            )

            the_keywords <- get_keywords(
                scores_election16, n_dimensions=0:5,
                n_words=40, capture_output=TRUE
            )

            ## for keywords split across multiple dimensions (here, left trolls)
            ## manually combined: the first pivot-score column is overwritten
            ## with the sum of the second and third, and keywords are then
            ## extracted for dimension 0 (the `$D0` element of the result).
            scores_election16_alt_plus <- scores_election16
            scores_election16_alt_plus$pivot_scores[,1] <- scores_election16_alt_plus$pivot_scores[,2] +
                scores_election16_alt_plus$pivot_scores[,3]
            the_keywords_alt_plus <- get_keywords(
                scores_election16_alt_plus, n_dimensions=0,
                n_words=40, capture_output=TRUE
            )$D0

            ## Keep only rows not flagged as non-English (the document scores
            ## evidently carry the not_english column from the metadata).
            document_scores_election16 <- subset(document_scores_election16, !not_english)

            ## For single clusters keep just the IDs and the columns X0..X9.
            if (the_cluster != "all") {
                document_scores_election16 <- document_scores_election16[,c("userid","tweetid",paste0("X",0:9))]
            }

            save(
                the_keywords, the_keywords_alt_plus,
                document_scores_election16,
                file=paste0(
                    "data/en_",tweets_or_descriptions,"_",
                    start_date,"_to_", end_date,
                    "_dimensions_election16_",
                    if (remove_news) "remove_news" else "", "_", the_cluster,
                    "_docs_only_replicate_psrm.RData"
                )
            )

        }

        ## Disabled PCA variant, kept for reference.
        ## if (the_cluster == "all") {

        ##                 system.time(
        ##     scores_election16_pca <- scale_text(
        ##         meta = meta_election16,
        ##         tdm = tdm_election16,
        ##         compress_fast = TRUE,
        ##         constrain_outliers = F,
        ##         simple=T,
        ##         holdout=meta_election16$holdout,
        ##         approx=T,
        ##         ## n_dimension_compression=(ncol(tdm_election16))
        ##         no_truncation = TRUE
        ##     )
        ## )

        ## document_scores_election16_pca <- score_documents(
        ##     scores = scores_election16_pca,
        ##     n_dimensions = ifelse(
        ##         the_cluster=="all",
        ##         100,
        ##         10
        ##     ),
        ##     doc_length_correction = 1
        ## )

        ## save(
        ##     document_scores_election16_pca,
        ##     file=paste0(
        ##         "data/en_",tweets_or_descriptions,"_",
        ##         start_date,"_to_", end_date,
        ##         "_dimensions_election16_replicate_psrm_",
        ##         ifelse(remove_news, "remove_news", ""), "_", the_cluster,
        ##         "_docs_only_pca_replicate_psrm.RData"
        ##     )
        ## )

        ## }


        ## Mutual information, only for the window that starts before 2016.
        if (start_date < "2016-01-01") {
            ## Group membership: tweet dated 2016-01-01 or later (1) versus
            ## earlier (0), among the rows that are not held out.
            mi_vocab_list_2016_v_2015 <- calcMI(
                .tdm=tdm_election16[!meta_election16$holdout,],
                .member=with(
                    meta_election16[!meta_election16$holdout,],
                    as.integer(substr(tweet_time, 1, 10) >= "2016-01-01")
                )
            )
            if (the_cluster!="all") {
                ## Group membership: accounts of this cluster (1) versus every
                ## other row kept by the pre-cluster mask (0).
                mi_vocab_list_these_trolls_v_others <- calcMI(
                    .tdm=tdm_election16[
                        !meta_election16$same_holdout_all_trolls,
                        ],
                    .member=with(
                        meta_election16[
                            !meta_election16$same_holdout_all_trolls,
                            ],
                        as.integer(userid %in% these_trolls)
                    )
                )
            } else {
                mi_vocab_list_these_trolls_v_others <- NA
            }
            save(
                mi_vocab_list_2016_v_2015,
                mi_vocab_list_these_trolls_v_others,
                file=paste0(
                    "data/en_",tweets_or_descriptions,"_",
                    start_date,"_to_", end_date, "_dimensions_election16_",
                    if (remove_news) "remove_news" else "", "_", the_cluster,
                    "_mi_keywords_replicate_psrm.RData"
                )
            )
        }
    }

}

## Print the wall-clock time at which the script finishes.
print(Sys.time())
